# Generate Dissimilar Titles & Keywords

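This notebook generates synthetic groups of dissimilar tab titles, together with keywords extracted from them, and labels each group `None`. The resulting rows are used to train the model not to suggest a title when the tabs in a group are unrelated.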

In [2]:
import sys
from dotenv import load_dotenv
# sys.path.append("")
# Load environment variables from a local .env file, if one can be found.
# The boolean result is displayed as the cell output.
load_dotenv()

False

In [3]:
import pandas as pd
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
# Source table of titles grouped by topic; the cells below read its
# 'Topic' and 'Title' columns.
df_dis = pd.read_csv('./data/dissimilar_topics_and_titles.csv')

In [44]:
import random
import numpy as np
import re

# Upper bound on the number of keywords get_keywords() keeps per group of titles.
MAX_WORDS_PER_TOPIC = 3

def get_keywords(documents, other_documents, max_words=None, min_score=0.05):
    """Extract the highest-weighted c-TF-IDF keywords for two groups of titles.

    Each group is concatenated into a single "class" document, digits are
    stripped, English stop words are dropped, and the remaining words are
    weighted with ``ClassTfidfTransformer`` so that words characteristic of
    one group relative to the other score highest.

    Args:
        documents: Iterable of title strings forming the first group.
        other_documents: Iterable of title strings forming the second group
            (may be empty).
        max_words: Maximum number of keywords to keep per group. ``None``
            (the default) uses the module-level ``MAX_WORDS_PER_TOPIC``.
        min_score: A word is kept only if its weight is strictly greater
            than this value.

    Returns:
        A list with two lists of keywords, in descending weight order:
        index 0 for ``documents`` and index 1 for ``other_documents``.

    Raises:
        ValueError: Propagated from ``CountVectorizer`` when no vocabulary
            can be built (e.g. both groups contain only stop words or digits).
    """
    if max_words is None:
        # Resolved at call time so later changes to the constant are honoured.
        max_words = MAX_WORDS_PER_TOPIC

    # One joined string per group; digits are removed so that numbers never
    # surface as keywords.
    joined_docs = [
        re.sub(r'\d+', '', ' '.join(group))
        for group in (documents, other_documents)
    ]

    vectorizer_model = CountVectorizer(stop_words="english")
    word_counts = vectorizer_model.fit_transform(joined_docs)
    vectorizer_words = vectorizer_model.get_feature_names_out()

    weights = ClassTfidfTransformer().fit_transform(word_counts).toarray()
    # Column indices of each row ordered from highest to lowest weight.
    sorted_indices = np.argsort(-weights, 1)

    keyword_list = []
    for class_weights, class_order in zip(weights, sorted_indices):
        topic_words = []
        for word_index in class_order:
            # Weights are visited in descending order, so the first one that
            # fails the threshold means no later word can pass either.
            if not class_weights[word_index] > min_score:
                break
            topic_words.append(vectorizer_words[word_index])
            if len(topic_words) >= max_words:
                break
        keyword_list.append(topic_words)
    return keyword_list
    
def generate_dissimilar_documents(df, n=1000, seed=None):
    """Build synthetic rows of unrelated titles labelled ``'None'``.

    For every row, 2-4 distinct topics are drawn and one title is picked from
    each, so the titles in a row do not share a topic. Up to 10 further titles
    are drawn from the rest of the table to act as the "other tabs in the
    window", and ``get_keywords`` is used to extract keywords for the chosen
    titles relative to those other titles.

    Assuming 10 topics with 50 titles each (as noted by the original author),
    the number of distinct title combinations is
    10C2 * 50^2 + 10C3 * 50^3 + 10C4 * 50^4 (roughly 1.3 billion), so
    duplicate rows are unlikely for the default ``n``.

    Args:
        df: DataFrame with a 'Topic' column and a 'Title' column.
        n: Number of rows to generate.
        seed: Optional seed for a private random generator, making the output
            reproducible. When ``None`` the global ``random`` module state is
            used, as before.

    Returns:
        DataFrame with columns 'input_titles' (space-joined titles),
        'input_keywords' (comma-joined keywords) and 'output' (always the
        string 'None').

    Raises:
        ValueError: If ``df`` contains fewer than two distinct topics.
    """
    rng = random.Random(seed) if seed is not None else random

    topics = list(df['Topic'].unique())
    if len(topics) < 2:
        raise ValueError("need at least 2 distinct topics to build dissimilar groups")
    topic_to_docs = {topic: list(df[df['Topic'] == topic]['Title']) for topic in topics}

    # De-duplicated titles in first-seen order. Computed once (it does not
    # change between iterations) and kept ordered so that sampling from it is
    # reproducible; iterating a set of strings is not stable across runs.
    unique_titles = list(dict.fromkeys(df['Title']))
    # Never ask for more topics than exist.
    max_topics = min(4, len(topics))

    all_joined_docs = []
    all_joined_keywords = []
    all_labels = []
    for _ in range(n):
        chosen_topics = rng.sample(topics, rng.randint(2, max_topics))
        documents = [rng.choice(topic_to_docs[topic]) for topic in chosen_topics]

        # get a random sample of 0-10 from remaining documents, which could be related to any topic
        # mimics someone adding dissimilar tab to create group and then has other tabs in window
        chosen = set(documents)
        remaining = [title for title in unique_titles if title not in chosen]
        other_documents = rng.sample(remaining, rng.randint(0, min(10, len(remaining))))

        # Index 0 holds the keywords of the chosen titles.
        keywords = get_keywords(documents, other_documents)[0]

        all_joined_docs.append(' '.join(documents))
        all_joined_keywords.append(','.join(keywords))
        all_labels.append('None')

    return pd.DataFrame({
        'input_titles': all_joined_docs,
        'input_keywords': all_joined_keywords,
        'output': all_labels
    })

In [45]:
# Generate the dissimilar-title examples using the function's default row count.
df_out = generate_dissimilar_documents(df_dis)

Flight Delays: Passenger Rights Yoga for Beginners: Simple Poses to Try Gaming Accessories: Keyboards and Mice accessories,try,simple
MOOC Forums: Engage with Classmates Fitness Routine: 10-Minute Home Workout Product Hunt: Innovative Startups Featured DIY Chalk Paint: Upcycling Furniture innovative,mooc,minute
Animation Showcase: Upcoming Releases Telegram Channel: Daily News Updates news,telegram,showcase
Reality TV Recap: Most Shocking Eliminations Smart Watches: Fitness and Beyond Group Fitness: Benefits of Exercising Together Small Business Loans: How to Qualify fitness,recap,shocking
Concert Review: Epic Performance in Sold-Out Arena Comparison Chart: Subscription Box Services Productivity Apps: Top Picks for Busy People Working Out at Home: Minimal Equipment Needed concert,picks,services
In-Depth: Effects of Rising Sea Levels Wreath Making: Seasonal Door Decor wreath,seasonal,sea
Hostel Reviews: Finding Budget Accommodation Buyer’s Guide: Best Laptops for Students Wreath Making:

In [46]:
# Preview the first rows of the generated examples.
df_out.head()

Unnamed: 0,input_titles,input_keywords,output
0,Flight Delays: Passenger Rights Yoga for Begin...,"accessories,try,simple",
1,MOOC Forums: Engage with Classmates Fitness Ro...,"innovative,mooc,minute",
2,Animation Showcase: Upcoming Releases Telegram...,"news,telegram,showcase",
3,Reality TV Recap: Most Shocking Eliminations S...,"fitness,recap,shocking",
4,Concert Review: Epic Performance in Sold-Out A...,"concert,picks,services",


In [15]:
# Persist the generated examples. With the default to_csv settings the row
# index is written as an unnamed first column.
# NOTE(review): the 'output' label is the literal string 'None', which
# pd.read_csv may parse as a missing value by default; readers probably need
# keep_default_na=False to get the label back as text -- confirm.
df_out.to_csv('./test_data/dissimilar_topics_and_keywords_01_10.csv')